R Markdown

library(dplyr)
library(reshape2)
library(factoextra) #fviz_eig
library(tidyverse)
library(stringr)

Gene_PA_df <- read.table("/data02/Analysis/Projects/2_CPE_Transmission/Tracking_Plasmid_Change/RoaryPCA__plasmids/gene_presence_absence.Rtab", quote = "", check.names = FALSE, header = TRUE, sep = "\t")
#head(Gene_PA_df)
#View(as.data.frame(Gene_PA_df))

# Transpose to samples in rows and genes in columns
Gene_PA_df_t  <- data.table::transpose(Gene_PA_df, keep.names = "Plasmid", make.names = "Gene")   
#View(Gene_PA_df_t)

# Load metadata file
Gene_PA_meta <- read.table("/data02/Analysis/Projects/2_CPE_Transmission/Tracking_Plasmid_Change/RoaryPCA__plasmids/CPE_Transmission_959_plasmids_complete.csv", check.names = FALSE, header = TRUE, sep = ",", fill = TRUE)
#head(Gene_PA_meta)
#tail(Gene_PA_meta)

# Combine the Matrix with Metadata
Gene_PA_df_with_metadata <- left_join(Gene_PA_df_t, Gene_PA_meta, by = c("Plasmid"))
#View(Gene_PA_df_with_metadata)
#head(Gene_PA_df_with_metadata)
#tail(Gene_PA_df_with_metadata)
#write.csv(Gene_PA_df_with_metadata,"metadata_matrix.csv",col.names = TRUE)


Gene_PA_df_with_metadata$year=format(as.Date(Gene_PA_df_with_metadata$date_of_culture, format="%d/%m/%Y"),"%Y")

# remove na in r - remove rows - na.omit function / option
#Gene_PA_df_with_metadata.withoutNA <- na.omit(Gene_PA_df_with_metadata) 
Gene_PA_df_with_metadata <- Gene_PA_df_with_metadata %>%  
  mutate(lengthbin = case_when(Plasmid_length_bp<=10000 ~ "<10K",
                               Plasmid_length_bp>=10000 & Plasmid_length_bp<=39999 ~ "10K-39.9K",
                               Plasmid_length_bp>=40000 & Plasmid_length_bp<=49999 ~ "40K-49.9K",
                               Plasmid_length_bp>=50000 & Plasmid_length_bp<=59999 ~ "40K-49.9K",
                               Plasmid_length_bp>=60000 & Plasmid_length_bp<=69999 ~ "60K-69.9K",
                               Plasmid_length_bp>=70000 & Plasmid_length_bp<=79999 ~ "70K-79.9K",
                               Plasmid_length_bp>=80000 & Plasmid_length_bp<=89999 ~ "80K-89.9K",
                               Plasmid_length_bp>=90000 & Plasmid_length_bp<=99999 ~ "90K-99.9K",
                               Plasmid_length_bp>=100000 & Plasmid_length_bp<=199999 ~ "100K-199.9K",
                               Plasmid_length_bp>=200000 ~ ">200K",
  ))

#View(Gene_PA_df_with_metadata)


# Convert first column to rownames
roaryTab.df  <- Gene_PA_df_with_metadata %>% filter(!str_detect(merged_Inc, 'multipleInc|rep_cluster_1195|IncFIB')) %>% remove_rownames %>% column_to_rownames(var="Plasmid")
#head(roaryTab.df)
#Gene_PA_df_with_metadata %>% filter(merged_Inc != "multipleInc" |merged_Inc != "rep_cluster_1195" |merged_Inc != "IncFIB") %>% View()

#write.csv(roaryTab.df,"/data02/Analysis/Projects/2_CPE_Transmission/Tracking_Plasmid_Change/RoaryPCA__plasmids/metadata_matrix.csv")
# remove last 15 metadata columns for prcomp
# View(roaryTab.df[1:(length(roaryTab.df)-15)]) 

#---------------> SCALE TRUE
# Run PCA
#res.pca.scaleT  <- prcomp(roaryTab.df[1:(length(roaryTab.df)-15)], scale = TRUE)

options(ggrepel.max.overlaps = 30)
#fviz_pca_ind(res.pca.scaleT)
#fviz_pca_ind(res.pca.scaleT,fill.ind = roaryTab.df$Hospital_type)
#fviz_pca_ind(res.pca.scaleT,fill.ind = res.pca.scaleT$year)
#fviz_pca_ind(res.pca.scaleT, label="none", habillage=roaryTab.df$Hospital_type, addEllipses=TRUE, ellipse.level=0.95)

#fviz_pca_ind(res.pca.scaleT, fill.ind = roaryTab.df$Plasmid_CP_gene,pointshape = 21, pointsize = 3,addEllipses=TRUE, ellipse.level=0.95, repel = TRUE)


#---------------> SCALE FALSE

# Run PCA
res.pca.scaleF  <- prcomp(roaryTab.df[1:(length(roaryTab.df)-17)], scale = FALSE)

options(ggrepel.max.overlaps = 0)
#fviz_pca_ind(res.pca.scaleT)
#fviz_pca_ind(res.pca.scaleT,fill.ind = roaryTab.df$Hospital_type)
#fviz_pca_ind(res.pca.scaleT,fill.ind = res.pca.scaleT$year)
#fviz_pca_ind(res.pca.scaleF, label="none", habillage=roaryTab.df$Hospital_type, addEllipses=TRUE, ellipse.level=0.95)

# fviz_pca_ind(res.pca.scaleF, fill.ind = roaryTab.df$Plasmid_CP_gene,pointshape = 21, pointsize = 3,addEllipses=TRUE, ellipse.level=0.95, repel = TRUE)
# 
# fviz_pca_ind(res.pca.scaleF, fill.ind = roaryTab.df$lengthbin,pointshape = 21, pointsize = 3,addEllipses=TRUE, ellipse.level=0.95, repel = TRUE)
# 
# fviz_pca_ind(res.pca.scaleF, fill.ind = roaryTab.df$year,pointshape = 21, pointsize = 3,addEllipses=TRUE, ellipse.level=0.95, repel = TRUE)
# 
# fviz_pca_ind(res.pca.scaleF, fill.ind = roaryTab.df$Genomic_species,pointshape = 21, pointsize = 3,addEllipses=FALSE, ellipse.level=0.95, repel = TRUE)
# 
# fviz_pca_ind(res.pca.scaleF, fill.ind = roaryTab.df$Genomic_species,pointshape = 21, pointsize = 3,addEllipses=FALSE, ellipse.level=0.95, repel = TRUE)
# 
# fviz_pca_ind(res.pca.scaleF, fill.ind = roaryTab.df$Genomic_species,pointshape = 21, pointsize = 3,addEllipses=FALSE, ellipse.level=0.95, repel = TRUE)
# 
# fviz_pca_ind(res.pca.scaleF, fill.ind = roaryTab.df$Plasmid_CP_gene,pointshape = 21, pointsize = 3,addEllipses=FALSE, ellipse.level=0.95, repel = TRUE)

#######------------Multiple labels---------###########
basic_plot <- fviz_pca_ind(res.pca.scaleF, label="none")
#View(head(basic_plot$data))
#tail(basic_plot$data)

# Combine the Matrix with Metadata
basic_plot_with_meta <- left_join(basic_plot$data, Gene_PA_meta, by = c("name" = "Plasmid"))
#View((basic_plot_with_meta))

# ggplot(basic_plot_with_meta,
#        aes(x=x,y=y,col=Plasmid_CP_gene,shape=factor(Hospital_type))) + geom_point(size=5) + theme_bw()
# 
# ggplot(basic_plot_with_meta,
#        aes(x=x,y=y,col=Plasmid_CP_gene,shape=factor(Hospital_type))) + geom_point(size=2, alpha = 0.3) + theme_bw()

ggplot(basic_plot_with_meta,
       aes(x=x,y=y,col=Plasmid_CP_gene,shape=Inc_type)) + geom_point(size=2, alpha = 0.3) + theme_bw()

ggplot(basic_plot_with_meta,
       aes(x=x,y=y,col=Plasmid_CP_gene,shape=merged_Inc)) + geom_point(size=2, alpha = 0.3) + theme_bw()

library(plotly)

# CP gene and Hospital

p1 <- ggplot(basic_plot_with_meta,
       aes(x=x,y=y,col=Plasmid_CP_gene,shape=factor(Hospital_type))) + geom_point(size=2, alpha = 0.3) + theme_bw()
ggplotly(p1)
p2 <- ggplot(basic_plot_with_meta,
             aes(x=x,y=y,col=Plasmid_CP_gene,shape=Genomic_species)) + geom_point(size=2, alpha = 0.3) + theme_bw()
ggplotly(p2)
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 12. Consider
## specifying shapes manually if you must have them.
p3 <- ggplot(basic_plot_with_meta,
             aes(x=x,y=y,col=Plasmid_CP_gene,shape=merged_Inc)) + geom_point(size=2, alpha = 0.3) + theme_bw()
ggplotly(p3)